In [1]:
import pandas as pd
import urllib
import requests
import os
In [2]:
%matplotlib inline
In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [4]:
def get_receipt_data(date):
month_year = date.strftime('%m/%Y')
date_month_year = date.strftime('%d-%m-%Y')
uri = 'https://treasury.ap.gov.in/cybertry/index1.php'
body = {'service': 'datewise',
'hdn': 2,
'edate': month_year,
'hoacc': date_month_year,
'bankname': date_month_year,
'mmyy': month_year
}
resp = requests.post(uri, data=body)
data_tables = pd.read_html(resp.text)
data = data_tables[0]
print(data.shape, date_month_year)
data.columns = data.iloc[0]
data.drop(0, inplace=True)
return data
In [5]:
def get_receipt_data_date_range(start_day, end_day):
'''date time format mm-dd-yyyy'''
combined_data = pd.DataFrame()
existing_files = os.listdir('temp_data/')
for date in pd.date_range(start_day, end_day):
date_month_year = date.strftime('%d-%m-%Y')
filename = '{0}.csv'.format(date_month_year)
if filename not in existing_files:
data = get_receipt_data(date)
data.to_csv('temp_data/{0}'.format(filename))
else:
data = pd.read_csv('temp_data/{0}'.format(filename))
combined_data = pd.concat([combined_data, data])
return combined_data
In [7]:
sample_data = get_receipt_data_date_range('04-01-2017', '05-01-2018')
In [8]:
hoas = pd.read_csv('hoa_2018.csv')
In [9]:
hoas.columns = hoas.iloc[0]
hoas.drop(0, inplace=True)
hoas.head()
Out[9]:
In [10]:
# Check if head of account information present.
hoas_intersection = set(sample_data['Head of Account'].astype(str).unique()) & set(hoas.hoa.astype(str).unique())
len(hoas_intersection) == len(sample_data['Head of Account'].unique())
Out[10]:
In [11]:
sample_data['Head of Account'].unique()
Out[11]:
In [12]:
sample_data.shape
Out[12]:
In [13]:
sample_data.columns
Out[13]:
In [14]:
sample_data.head()
Out[14]:
In [15]:
sample_data['Scroll Date'].unique()
Out[15]:
we need to convert the data based on months, but there are nan values present which we.. lets see if we can resolve that
In [16]:
sample_data[pd.isnull(sample_data['Scroll Date'])]['Amount'].shape
Out[16]:
In [18]:
sample_data[pd.isnull(sample_data['Scroll Date'])]['Amount'].sum()
Out[18]:
In [19]:
sample_data['Combined Date'] = sample_data['Scroll Date'].fillna(sample_data['Bank Date'])
In [20]:
sample_data['date'] = pd.to_datetime(sample_data['Combined Date'])
In [21]:
sample_data['month_year'] = sample_data.date.apply(lambda x: x.strftime('%Y-%m'))
In [22]:
sample_data['Amount'] = sample_data['Amount'].astype(float)
In [23]:
sample_data.groupby(['month_year'])['Amount'].sum()
Out[23]:
In [71]:
sample_data['day'] = sample_data.date.apply(lambda x: x.day)
In [72]:
sample_data[sample_data['month_year'] == '2016-04'].groupby(['day'])['Amount'].sum()
Out[72]:
In [ ]: